import requests
import time
import os
from config import DATA_DIR,RESULT_DIR
import json
from tqdm import tqdm

# Ensure every output directory exists before scraping and writing results.
for _out_dir in (
    os.path.join(DATA_DIR, 'raw'),
    os.path.join(DATA_DIR, 'processed'),
    RESULT_DIR,
):
    os.makedirs(_out_dir, exist_ok=True)

# Endpoint of the Hurun ranking-list API (returns paginated JSON rows).
hurun_url = "https://www.hurun.net/zh-CN/Rank/HsRankDetailsList"
# Browser-like User-Agent so the request is not rejected as a bot.
headers = {
    "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0"
}
# Number of pages to fetch; each page carries 10 rows (see `limit` below),
# so this targets roughly 1100 entries in total.
total_pages=110

# Accumulates the `rows` payload from every successfully fetched page.
all_data=[]

print("开始爬取网页数据......")

# Fetch the ranking list page by page; failed pages are logged and skipped
# so one bad response never aborts the whole run.
for page in tqdm(range(total_pages), desc="爬取进度", unit="页"):
    params = {
        "num": "ODBYW2BI",   # ranking-list identifier expected by the API — TODO confirm against site
        "search": "",
        "offset": 10 * page,  # the API paginates by row offset
        "limit": 10,
    }

    try:
        # A timeout is essential: without one a stalled connection would
        # hang the scraper forever on a single request.
        resp = requests.get(hurun_url, params=params, headers=headers, timeout=15)
        resp.raise_for_status()
        page_data = resp.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError also covers malformed JSON bodies on requests versions
        # where JSONDecodeError is not a RequestException subclass.
        print(f"获取第{page+1}页数据时出错：{e}")
        continue

    all_data.extend(page_data.get('rows', []))
    time.sleep(1)  # throttle: be polite to the server between requests

# Report the total row count, then persist the raw API payload as
# pretty-printed UTF-8 JSON for the downstream processing step.
print(f"爬取完成！总共获取到 {len(all_data)} 条数据")

save_path = os.path.join(DATA_DIR, 'raw', 'hurun_rich_list_2024.json')
serialized = json.dumps(all_data, ensure_ascii=False, indent=2)
with open(save_path, "w", encoding="utf-8") as out_file:
    out_file.write(serialized)
print(f"文件已保存至：{save_path}")